In [1]:
import pandas as pd
import numpy as np
import re
# random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# tree viz
from sklearn import tree
from sklearn.tree import _tree
import pydotplus
from IPython.display import Image
from os import system
In [2]:
# lookup for the feature names
# lookup for the feature names
# Column names for the mushroom data file loaded below (presumably the UCI
# Agaricus-Lepiota dataset — the attribute codes match; confirm against the
# data source). Index 0 is the label column; the rest are categorical features.
features = ['edibility', 'cap shape', 'cap surface', 'cap color', 'bruise', 'odor',
            'gill attachment', 'gill spacing', 'gill size', 'gill color', 'stalk shape',
            'stalk root', 'stalk surface above ring', 'stalk surface below ring',
            'stalk color above ring', 'stalk color below ring', 'veil type', 'veil color',
            'ring number', 'ring type', 'spore print color', 'population', 'habitat']
# lookup for the feature values
# Keys are column indices into `features` (1..22, skipping the label column 0);
# each value is a comma-separated list of 'full name=letter code' pairs as they
# appear in the raw data file.
abbrevs = {
    1: 'bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s',
    2: 'fibrous=f,grooves=g,scaly=y,smooth=s',
    3: 'brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y',
    4: 'yes=t,no=f',
    5: 'almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s',
    6: 'attached=a,descending=d,free=f,notched=n',
    7: 'close=c,crowded=w,distant=d',
    8: 'broad=b,narrow=n',
    9: 'black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y',
    10:'enlarging=e,tapering=t',
    11:'bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?',
    12:'fibrous=f,scaly=y,silky=k,smooth=s',
    13:'fibrous=f,scaly=y,silky=k,smooth=s',
    14:'brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y',
    15:'brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y',
    16:'partial=p,universal=u',
    17:'brown=n,orange=o,white=w,yellow=y',
    18:'none=n,one=o,two=t',
    19:'cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z',
    20:'black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y',
    21:'abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y',
    22:'grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d'
}
def create_mapping(x):
    """Turn one 'name=letter' token into a (letter, name) pair."""
    parts = x.split('=')
    return parts[1], parts[0]

def get_abbrev_lookups(abbrevs):
    """Build, for each column index, a dict from abbreviation letter to full name.

    `abbrevs` maps a column index to a comma-separated string of
    'name=letter' tokens; each token is inverted so the single-letter code
    in the raw data can be looked up directly.
    """
    return {
        idx: dict(create_mapping(token) for token in spec.split(','))
        for idx, spec in abbrevs.items()
    }
# precompute, per feature column, the letter -> full-name lookup tables
abbrev_lookups = get_abbrev_lookups(abbrevs)
Utility functions
In [3]:
# utility function to print out the accuracy for the predictions
def print_accuracy_report(y_test, y_pred, print_accuracy=True, print_cm=True, print_cr=True):
    """Print an evaluation summary: overall accuracy, a labelled confusion
    matrix, and the per-class classification report. Each section can be
    toggled off via its corresponding flag."""
    separator = "=" * 40
    if print_accuracy:
        print(separator)
        print('accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))
    if print_cm:
        # class 0 = poisonous, class 1 = edible (see label encoding below)
        df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred),
                             columns=['predicted - poisonous', 'predicted - edible'],
                             index=['actual - poisonous', 'actual - edible'])
        print(separator)
        print("confusion matrix")
        print(df_cm)
    if print_cr:
        print(separator)
        print("classification report")
        print(classification_report(y_test, y_pred,
                                    target_names=['actual - poisonous', 'actual - edible']))
Data loading & pre-processing
In [4]:
# data loading
# NOTE(review): hardcoded relative path — run the notebook from the project
# root so ./data resolves; presumably the UCI Agaricus-Lepiota file.
df = pd.read_csv('./data/agaricus-lepiota.data.csv',
                 header=None, names=features)
# update the values of the cells from abbreviations to meaningful texts
omit_idx = 1  # column 0 is the 'edibility' label; it keeps its raw coding
for i in range(omit_idx, df.shape[1]):
    df.iloc[:, i] = df.iloc[:, i].map(lambda x: abbrev_lookups[i][x])
# recoding categorical variables into one hot encoding
# create feature sets and labels
X = pd.get_dummies(df.iloc[:,omit_idx:])
y = df['edibility'].map(lambda x: 0 if x == 'p' else 1)  # 0 = poisonous, 1 = edible
# create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=424242)
# sanity check
# check how many mushrooms are edible, looks like the classes are pretty balanced
print('percentage of edible mushrooms: {:.2f}%'.format(y.mean()*100))
Train a random forest classifier
In [5]:
# build a classifier
# NOTE(review): shallow trees (max_depth=3) and a fixed seed keep the forest
# small and reproducible, which makes the tree visualizations below readable
clf = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=328919475)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# check the accuracy
print_accuracy_report(y_test, y_pred)
Important features
In [6]:
# top 5 important features
# pair each one-hot column with its importance score, sorted descending
feature_weights = sorted(list(zip(X.columns, clf.feature_importances_)),
                         key=lambda x: x[1], reverse=True)
feature_weights[:5]
Out[6]:
Tree visualization
In [7]:
# take a look at a tree directly from Jupyter notebook
idx = 1  # which of the forest's estimators to render
dot_data = tree.export_graphviz(clf.estimators_[idx], out_file=None,
                                feature_names=X.columns,
                                class_names=['poisonous', 'edible'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
Out[7]:
In [8]:
# export all trees to png or json
def export_json(decision_tree, filename, feature_names=None):
    """
    Export a decision tree as a nested JSON document.

    Code adapted from Peter Prettenhofer at http://bl.ocks.org/pprett/3813537

    Each node is written as an object with "error" (impurity), "samples",
    "value" (per-class counts), "name" (a human-readable split question, or
    the predicted class for a leaf) and "type" ("split" or "leaf"). Split
    nodes carry a "children" array of [no-branch, yes-branch] subtrees —
    a d3.js-friendly format.

    Parameters
    ----------
    decision_tree : fitted decision tree classifier, or sklearn.tree._tree.Tree
        The tree to be exported to JSON.
    filename : string
        Path of the output file; created or overwritten.
    feature_names : list of strings, optional (default=None)
        Names of each of the features. When None, features are reported
        as "X[i]".
    """
    def arr_to_py(arr):
        # Flatten a numpy array into a plain Python list of builtin
        # floats/ints so it is JSON-serializable.
        arr = arr.ravel()
        wrapper = float
        # BUG FIX: np.int (deprecated in NumPy 1.20, removed in 1.24) raised
        # AttributeError here; np.integer is the correct abstract dtype for
        # an "any integer kind" check.
        if np.issubdtype(arr.dtype, np.integer):
            wrapper = int
        return list(map(wrapper, arr.tolist()))

    def get_class(tree, node_id):
        # Majority class at this node; value[node_id][0] holds the
        # [poisonous, edible] sample counts (class 0 = poisonous).
        p, e = tree.value[node_id][0][0], tree.value[node_id][0][1]
        return 'poisonous' if p > e else 'edible'

    def parse_feature(feature):
        # One-hot column names look like '<feature>_<value>'; turn them
        # into a readable yes/no question for the split label.
        m = re.match('(.*)_(.*)', feature)
        return 'Is ' + m.group(1) + ' ' + m.group(2) + '?'

    def node_to_str(tree, node_id, node_type):
        """
        Serialize one node's attributes (children handled by `recurse`).
        node_type: 0: root, 1: left child, 2: right child
        """
        node_repr = '"error": %.4f, "samples": %d, "value": %s' \
                    % (tree.impurity[node_id],
                       tree.n_node_samples[node_id],
                       arr_to_py(tree.value[node_id][0]))
        # left child = condition false ("no"), right child = true ("yes")
        label = '' if node_type == 0 else ('no -> ' if node_type == 1 else 'yes -> ')
        if tree.children_left[node_id] != _tree.TREE_LEAF:
            if feature_names is not None:
                feature = feature_names[tree.feature[node_id]]
            else:
                feature = "X[%s]" % tree.feature[node_id]
            label = '"name": "' + label + '%s"' % (parse_feature(feature))
            node_type = '"type": "split"'
        else:
            node_type = '"type": "leaf"'
            label = '"name": "%s"' % get_class(tree, node_id)
        return ", ".join((node_repr, label, node_type))

    def recurse(tree, node_id, node_type, parent=None):
        # Depth-first emission of this node and, for splits, its children.
        if node_id == _tree.TREE_LEAF:
            raise ValueError("Invalid node_id %s" % _tree.TREE_LEAF)
        left_child = tree.children_left[node_id]
        right_child = tree.children_right[node_id]
        # Open node with description
        out_file.write('{%s' % node_to_str(tree, node_id, node_type))
        # write children (sklearn trees have either two children or none)
        if left_child != _tree.TREE_LEAF:
            out_file.write(', "children": [')
            recurse(tree, left_child, 1, node_id)
            out_file.write(', ')
            recurse(tree, right_child, 2, node_id)
            out_file.write(']')
        # close node
        out_file.write('}')

    with open(filename, 'w') as out_file:
        # accept either a fitted estimator or its underlying Tree object
        if isinstance(decision_tree, _tree.Tree):
            recurse(decision_tree, 0, 0)
        else:
            recurse(decision_tree.tree_, 0, 0)
def export_trees(export_type, feature_names):
    """Dump every tree of the fitted forest `clf` to tree<i>.png or tree<i>.json.

    export_type: 'png' renders via graphviz' `dot` CLI (reusing a temporary
    tree.dot file); 'json' uses export_json above. Any other value is a no-op.
    """
    for idx, estimator in enumerate(clf.estimators_):
        if export_type == 'png':
            # tree viz: write an intermediate dot file, then rasterize it
            filename = 'tree' + str(idx) + '.png'
            tree.export_graphviz(estimator, out_file='tree.dot',
                                 feature_names=feature_names,
                                 class_names=['poisonous', 'edible'],
                                 filled=True, rounded=True, impurity=False,
                                 special_characters=True)
            system('dot -Tpng tree.dot -o ' + filename)
        elif export_type == 'json':
            export_json(estimator, 'tree' + str(idx) + '.json',
                        feature_names=feature_names)
In [9]:
# write tree0.json .. tree9.json for downstream (e.g. d3.js) visualization
export_trees('json', X.columns)
In [10]:
# inspect the type of the underlying low-level Tree object of one estimator
type(clf.estimators_[0].tree_)
# clf.estimators_[0].tree_.max_n_classes
Out[10]: